In [1]:
import factor_analyzer
import pandas as pd
from sklearn.datasets import load_iris
from factor_analyzer import FactorAnalyzer
import matplotlib.pyplot as plt
import numpy as np 
import seaborn as sns
%matplotlib inline 
import plotly.express as px 
In [2]:
df = pd.read_csv("tornado.csv")
df.head()
Out[2]:
yr mo dy date st mag inj fat slat slon elat elon len wid
0 1950 1 3 1/3/1950 IL 3 3 0 39.10 -89.30 39.12 -89.23 3.6 130
1 1950 1 3 1/3/1950 MO 3 3 0 38.77 -90.22 38.83 -90.03 9.5 150
2 1950 1 3 1/3/1950 OH 1 1 0 40.88 -84.58 0.00 0.00 0.1 10
3 1950 1 13 1/13/1950 AR 3 1 1 34.40 -94.37 0.00 0.00 0.6 17
4 1950 1 25 1/25/1950 IL 2 0 0 41.17 -87.33 0.00 0.00 0.1 100
In [3]:
df = df.rename(columns={"yr": "Year", "mo": "Month",'dy':'Day','date':'Date','st':'State','mag':'Magnitude',
                       'inj':'Injuries','fat':'Fatalities','slat':'StartingLatitude','slon':'StartingLatitude',
                        'elat':'EndingLatitude','elon':'EndingLongitude','len':'Length','wid':'Width'})
df.head()
Out[3]:
Year Month Day Date State Magnitude Injuries Fatalities StartingLatitude StartingLatitude EndingLatitude EndingLongitude Length Width
0 1950 1 3 1/3/1950 IL 3 3 0 39.10 -89.30 39.12 -89.23 3.6 130
1 1950 1 3 1/3/1950 MO 3 3 0 38.77 -90.22 38.83 -90.03 9.5 150
2 1950 1 3 1/3/1950 OH 1 1 0 40.88 -84.58 0.00 0.00 0.1 10
3 1950 1 13 1/13/1950 AR 3 1 1 34.40 -94.37 0.00 0.00 0.6 17
4 1950 1 25 1/25/1950 IL 2 0 0 41.17 -87.33 0.00 0.00 0.1 100
In [4]:
condition = df['Year'] <= 2006  # Define the condition
df_older = df[condition]  # Select rows that meet the condition

df = df[~condition]  # Select rows that do not meet the condition

#df_older.head()
df.head()
#df.tail()
Out[4]:
Year Month Day Date State Magnitude Injuries Fatalities StartingLatitude StartingLatitude EndingLatitude EndingLongitude Length Width
49296 2007 1 4 1/4/2007 LA 1 0 0 30.60 -91.45 30.62 -91.47 1.83 75
49297 2007 1 4 1/4/2007 LA 1 15 2 29.92 -91.80 30.05 -91.73 15.07 100
49298 2007 1 5 1/5/2007 GA 0 0 0 33.27 -84.56 33.29 -84.55 1.68 200
49299 2007 1 5 1/5/2007 GA 0 0 0 31.32 -82.47 31.34 -82.47 2.00 100
49300 2007 1 5 1/5/2007 GA 1 0 0 33.36 -84.90 33.42 -84.84 5.39 200
In [5]:
# dimensions
size=df.size
shape=df.shape
dimensions=df.ndim
info=df.info()
print(f"shape: {shape}, size: {size}, dimensions:{dimensions}, info summary: {info}")
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18262 entries, 49296 to 67557
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Year              18262 non-null  int64  
 1   Month             18262 non-null  int64  
 2   Day               18262 non-null  int64  
 3   Date              18262 non-null  object 
 4   State             18262 non-null  object 
 5   Magnitude         18262 non-null  int64  
 6   Injuries          18262 non-null  int64  
 7   Fatalities        18262 non-null  int64  
 8   StartingLatitude  18262 non-null  float64
 9   StartingLatitude  18262 non-null  float64
 10  EndingLatitude    18262 non-null  float64
 11  EndingLongitude   18262 non-null  float64
 12  Length            18262 non-null  float64
 13  Width             18262 non-null  int64  
dtypes: float64(5), int64(7), object(2)
memory usage: 2.1+ MB
shape: (18262, 14), size: 255668, dimensions:2, info summary: None
In [6]:
#datatypes
types=df.dtypes
types
Out[6]:
Year                  int64
Month                 int64
Day                   int64
Date                 object
State                object
Magnitude             int64
Injuries              int64
Fatalities            int64
StartingLatitude    float64
StartingLatitude    float64
EndingLatitude      float64
EndingLongitude     float64
Length              float64
Width                 int64
dtype: object
In [7]:
#checking for nulls
df.isna().sum()
Out[7]:
Year                0
Month               0
Day                 0
Date                0
State               0
Magnitude           0
Injuries            0
Fatalities          0
StartingLatitude    0
StartingLatitude    0
EndingLatitude      0
EndingLongitude     0
Length              0
Width               0
dtype: int64
In [8]:
df.describe()
Out[8]:
Year Month Day Magnitude Injuries Fatalities StartingLatitude StartingLatitude EndingLatitude EndingLongitude Length Width
count 18262.00000 18262.000000 18262.000000 18262.000000 18262.000000 18262.000000 18262.000000 18262.000000 18262.000000 18262.000000 18262.000000 18262.000000
mean 2013.92520 5.815135 16.500876 0.285566 0.833808 0.072226 37.161643 -92.051996 36.610713 -90.557844 3.545453 161.174242
std 4.44525 2.625002 8.749836 1.883784 16.496789 1.591000 4.811412 8.329815 6.601923 13.999551 6.528550 261.827175
min 2007.00000 1.000000 1.000000 -9.000000 0.000000 0.000000 17.721200 -159.658000 0.000000 -159.647000 0.010000 1.000000
25% 2010.00000 4.000000 9.000000 0.000000 0.000000 0.000000 33.416025 -97.688750 33.268000 -97.552550 0.380000 40.000000
50% 2014.00000 5.000000 17.000000 0.000000 0.000000 0.000000 36.897850 -92.320450 36.812950 -92.041250 1.460000 75.000000
75% 2018.00000 7.000000 24.000000 1.000000 0.000000 0.000000 40.667925 -86.430950 40.620000 -86.072300 4.110000 160.000000
max 2021.00000 12.000000 31.000000 5.000000 1500.000000 158.000000 49.330000 -64.715100 49.330000 0.000000 168.530000 4576.000000

EDA, Length¶

In [9]:
## measures of central tendency for Tornado Length
# mean
mean=round(df['Length'].mean(),3)
# median
median=df['Length'].median()
# mode
mode=df['Length'].mode()
#standard deviation
std=df['Length'].std()
#variance
variance=df['Length'].var()
#range
range=df['Length'].max()-df['Length'].min()


print(f'mean: {mean}, median: {median}, mode: {mode}, standard deviation: {std},variance: {variance}, range:{range}')
mean: 3.545, median: 1.46, mode: 0    0.1
Name: Length, dtype: float64, standard deviation: 6.528549858490544,variance: 42.62196325479691, range:168.52

Histograms¶

In [10]:
df.hist(column="Year")
#more tornadoes recorded in recent years (or more tornadoes...)
Out[10]:
array([[<AxesSubplot:title={'center':'Year'}>]], dtype=object)
In [11]:
df.hist(column="Month")
#may has most tornadoes, pretty normal distribution except for heavier december
Out[11]:
array([[<AxesSubplot:title={'center':'Month'}>]], dtype=object)
In [12]:
df.hist(column="Day")
#uniform ish but more tonadoes recorded towards end of month
Out[12]:
array([[<AxesSubplot:title={'center':'Day'}>]], dtype=object)
In [13]:
df['Magnitude'].value_counts().plot(kind='barh',title="Tornadoe Magnitude Counts, 1957-2021")
Out[13]:
<AxesSubplot:title={'center':'Tornadoe Magnitude Counts, 1957-2021'}>
In [14]:
#injuries df
injuries=df['Injuries'].value_counts()
injuries_df = pd.DataFrame(data=injuries)

#fatalities df
fatalities=df['Fatalities'].value_counts()
fatalities_df = pd.DataFrame(data=fatalities)
#no injuries was by far most common then pretty much in order (1 injury was 2nd common, then 2, then 3, etc ish)
In [15]:
#merging injuries_df & fatalities_df together
injury_fatal=injuries_df.join(fatalities_df)
injury_fatal['Fatalities'] = injury_fatal['Fatalities'].fillna(0)
injury_fatal=injury_fatal.reset_index()
injury_fatal=injury_fatal.rename(columns={"index": "Count"})

injury_fatal.head(15)
Out[15]:
Count Injuries Fatalities
0 0 17059 17962.0
1 1 413 144.0
2 2 220 60.0
3 3 103 25.0
4 4 68 15.0
5 5 59 4.0
6 6 31 9.0
7 7 30 6.0
8 8 25 6.0
9 10 23 4.0
10 9 21 4.0
11 12 15 1.0
12 20 12 1.0
13 11 11 3.0
14 15 11 0.0
In [16]:
df[['Length','Injuries','Fatalities']].describe()
Out[16]:
Length Injuries Fatalities
count 18262.000000 18262.000000 18262.000000
mean 3.545453 0.833808 0.072226
std 6.528550 16.496789 1.591000
min 0.010000 0.000000 0.000000
25% 0.380000 0.000000 0.000000
50% 1.460000 0.000000 0.000000
75% 4.110000 0.000000 0.000000
max 168.530000 1500.000000 158.000000
In [17]:
#injury_fatal.to_excel("injury_fatal.xlsx") 
In [18]:
#injury_fatal['Fatalities'].plot(x="Count", y=["Injuries", "Fatalities"], kind="bar")
In [19]:
#count of tornadoes by state
print(df['State'].value_counts())
TX    1807
KS    1245
OK    1082
MS     925
AL     923
IA     779
IL     772
MO     759
LA     746
MN     652
CO     625
NE     618
FL     607
GA     606
AR     526
TN     511
KY     460
NC     457
ND     421
IN     414
WI     372
OH     367
SD     351
SC     344
VA     259
PA     258
MI     192
WY     154
NM     151
NY     129
MD     125
CA     106
MT      75
AZ      61
CT      43
ME      37
NJ      36
ID      35
WA      33
MA      33
WV      33
OR      29
UT      23
NV      20
NH      14
DE      14
VT      10
PR       9
HI       6
RI       5
DC       2
VI       1
Name: State, dtype: int64
In [20]:
#top 20 counts by state 
df['State'].value_counts()[:20].plot(kind='barh',title="Top 20 States by Tornado Count, 1957-2021")
Out[20]:
<AxesSubplot:title={'center':'Top 20 States by Tornado Count, 1957-2021'}>
In [21]:
df.hist(column="Length")
Out[21]:
array([[<AxesSubplot:title={'center':'Length'}>]], dtype=object)
In [22]:
df.hist(column="Width")
Out[22]:
array([[<AxesSubplot:title={'center':'Width'}>]], dtype=object)
In [23]:
#animation of width and length of tornadoes
fig=px.scatter(df, x="Width", y="Length", 
           animation_frame="Year", 
          # animation_group="State", #add if we wanted to do by state averages
           color="Magnitude",     # this is like "hue" in seaborn, to group by a categorical feature
           hover_name="State",  # adds labels from a categorical feature when hover over a data point
           range_x=[0,3000],  # this sets the min and max values to show in the x-axis
           range_y=[0,250],       # this sets the min and max values to show in the y-axis
           title="Tornado Width vs. Length from 1950-2021, by Magnitude",  # add a title 
          )
fig

some observations¶

  • 2016 was a biiiig tornado year (lots of mag 3 & 4 w/o mag 0-2s
  • less insances of mag 0-2 in earlier years (more likely this is because of diferent reporting, not a lack of occurances)
  • more tornadoes in later years (more likely bc of reporting habits changing again)
  • 2016-2021 seems to have a ton of high magnitude tornadoes?? did they stop including the lower mags??
  • more diversity in width of tornado, but most are below 500
  • most tornadoes are not very long (under 50)
  • april thru july are main months for tornadoes
  • southern, dry states are where most tornadoes are (TX, KS, OK)
In [24]:
##deaths by year bar chart

deaths = df.groupby('Year')['Fatalities'].sum().reset_index()
fig = px.bar(deaths , x = 'Year', y = 'Fatalities', color = 'Fatalities', labels = {'Fatalities':'Fatalities','Year':'Year'})
            
fig.show()
In [25]:
states_fat = df.groupby(['State'])['Fatalities'].sum().reset_index()
states_fat.head()
Out[25]:
State Fatalities
0 AL 301
1 AR 68
2 AZ 0
3 CA 0
4 CO 3
In [26]:
states_inj = df.groupby(['State'])['Injuries'].sum().reset_index()
#states_inj.head()
In [27]:
#fatalities by state heatmap
fig = px.choropleth(states_fat, 
                    locations = "State", 
                    
                   locationmode="USA-states", 
                   scope="usa",
                   color = 'Fatalities' ,
                   
                   labels={'Year':'Year','State':'State','Fatalities':'Deaths'},
                   color_continuous_scale= 'YlOrBr')
                   
fig.update_layout(title = "Tornado Fatalities in States 2007-2021")
fig.show()
In [28]:
#injuries by state heatmap

fig2 = px.choropleth(states_inj, 
                    locations = "State", 
                    
                   locationmode="USA-states", 
                   scope="usa",
                   color = 'Injuries' ,
                   
                   labels={'Year':'Year','State':'State','Injuries':'Injuries'},
                   color_continuous_scale= 'YlOrBr')
                   
fig2.update_layout(title = "Tornado Injuries in States 2007-2021")
fig2.show()

some more observations¶

  • Alabama has high destruction tornados (high injury & fatality, from heatmaps)
  • 2011 was a bad year for tornados... what caused the spike in 2011?
  • tornado count seems cyclical? goes up & down in a 10-ish year cycle